# data wrangling
library(plyr)
library(dplyr)
library(tidyr)
library(reshape)
library(stringr)
# maps and geography
library(rgeos)
library(sf) # World map
library(rnaturalearth)
# visualisation
library(ggplot2)
library(pheatmap)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
# differential abundance analysis
library(DESeq2)
The metadata contains information about the samples, such as the sampling date, the Longhurst region, the biome, GPS locations, and measurements like the temperature, salinity, oxygen, etc.
meta <- read.csv("/Users/jegoussc/Repositories/hanoxy/data/info/metadata.csv",
sep = ',')
meta$sampling_date = as.Date(meta$sampling_date)
# head(meta)
Function to get the seasons.
date2season <- function(DATES) {
WS <- as.Date("2012-12-15", format = "%Y-%m-%d") # Winter Solstice
SE <- as.Date("2012-3-15", format = "%Y-%m-%d") # Spring Equinox
SS <- as.Date("2012-6-15", format = "%Y-%m-%d") # Summer Solstice
FE <- as.Date("2012-9-15", format = "%Y-%m-%d") # Autumn Equinox
# Convert dates from any year to 2012 dates
d <- as.Date(strftime(DATES, format="2012-%m-%d"))
ifelse (d >= WS | d < SE, "Winter",
ifelse (d >= SE & d < SS, "Spring",
ifelse (d >= SS & d < FE, "Summer", "Autumn")))
}
Add the season parameter to metadata.
meta$season = date2season(meta$sampling_date)
First, we retrieve the world map data which relies on the
rnaturalearth package by Andy
South (2017).
world_map <- rnaturalearth::ne_countries(scale = 'small', returnclass = c("sf"))
Draw the base map.
map = ggplot() +
geom_sf(data = world_map, size = .2, fill = 'white', col = 'white') +
theme(panel.grid.major = element_line(color = "grey", linetype = "dashed", size = 0.25)) +
coord_sf(expand = FALSE) +
labs(x = 'Longitude', y = 'Latitude')
map
Longhurst provinces represent a partition of the world oceans into provinces as defined by Longhurst (1995; 1998; 2006) based on the prevailing role of physical forcing as a regulator of phytoplankton distribution. Initially, these static boundaries were developed at the Bedford Institute of Oceanography, Canada.
Note that the boundaries of these provinces are not fixed in time and space, but are dynamic and move under seasonal and inter-annual changes in physical forcing.
Data set was downloaded from https://www.marineregions.org/downloads.php.
# import longhurst provinces
longhurst <- sf::read_sf("/Users/jegoussc/Repositories/haliea/INFOS/longhurst-world-v4-2010.shx")
Draw the blank world map with the Longhurst provinces.
longhurst_map = map +
geom_sf(data = longhurst, aes(fill = ProvCode),
size = .1, col = "white", alpha = .25) +
geom_sf_text(data = longhurst %>%
group_by(ProvCode), aes(label = ProvCode),
colour = "black", size = 3, check_overlap = TRUE) +
scale_fill_manual(values = rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(54))) +
coord_sf(expand = FALSE) +
theme(legend.position = "none") +
ggtitle(paste("Longhurst Biogeochemical Provinces -",
length(unique(longhurst$ProvCode)),"provinces"))
longhurst_map
At the first level of reduction, Longhurst recognized four principal biomes (also referred to as domains in earlier publications): the Polar biome, the Westerlies biome, the Trade-Winds biome, and the Coastal Boundary Zone biome. These four biomes are recognizable in every major ocean basin. At the next level of reduction, the ocean basins are partitioned into provinces, roughly ten for each basin. These partitions provide a template for data analysis or for making parameter assignments on a global scale.
Add a column for a short biome or domain name.
longhurst$biome = str_split(longhurst$ProvDescr, pattern = " - ", simplify = TRUE)[,1]
# set order
longhurst$biome = factor(longhurst$biome , levels = c("Polar", "Westerlies", "Trades", "Coastal"))
Draw the world map with the biomes
biomes_map = map +
geom_sf(data = longhurst, aes(fill = biome), size = .1, col = "white", alpha = .5) +
theme(legend.position = "bottom", legend.direction = "horizontal") +
scale_fill_brewer(palette = "RdBu", direction = -1) +
coord_sf(expand = FALSE) +
labs(x = 'Longitude', y = 'Latitude', fill = "Biome") +
ggtitle("Biomes associated with Longhurst Biogeochemical Provinces")
biomes_map
legend_biome <- get_legend(biomes_map)
plot_grid(longhurst_map, biomes_map + theme(legend.position = "none"), NULL, legend_biome,
labels = c("A", "B", "", ""), label_size = 12,
ncol = 1, align = 'vh', axis = 'l',
rel_heights = c(1,1,-.2,0.5),
rel_widths = c(1,1,1,0.5))
ggsave(
"fig-s1.pdf",
plot = last_plot(),
device = "pdf",
scale = 1,
width = 6,
height = 7.5,
dpi = 300
)
biomes_map +
geom_point(aes(
x = meta$longitude,
y = meta$latitude), shape = 3) +
geom_text_repel(aes(
x = meta$longitude,
y = meta$latitude,
label=meta$station_name),
size = 2
)
meta %>%
group_by(biome) %>%
select(biome, temperature) %>%
summarise_at(vars(temperature), list(temp = mean))
## # A tibble: 4 × 2
## biome temp
## <chr> <dbl>
## 1 Coastal 21.8
## 2 Polar 2.68
## 3 Trade wind 25.1
## 4 Westerly 19.4
meta %>%
group_by(biome) %>%
select(biome, salinity) %>%
summarise_at(vars(salinity), list(sali = mean))
## # A tibble: 4 × 2
## biome sali
## <chr> <dbl>
## 1 Coastal 35.4
## 2 Polar 33.0
## 3 Trade wind 35.3
## 4 Westerly 36.7
meta %>%
group_by(biome) %>%
select(biome, nitrate) %>%
summarise_at(vars(nitrate), list(nit = mean), na.rm = TRUE)
## # A tibble: 4 × 2
## biome nit
## <chr> <dbl>
## 1 Coastal 2.48
## 2 Polar 5.82
## 3 Trade wind 0.838
## 4 Westerly 0.861
meta %>%
group_by(biome) %>%
select(biome, chlorophyll) %>%
summarise_at(vars(chlorophyll), list(chloro_mean = mean, chloro_sd = sd), na.rm = TRUE)
## # A tibble: 4 × 3
## biome chloro_mean chloro_sd
## <chr> <dbl> <dbl>
## 1 Coastal 0.277 0.415
## 2 Polar 1.32 2.14
## 3 Trade wind 0.137 0.119
## 4 Westerly 0.193 0.212
gtdb_info = read.delim("/Users/jegoussc/Repositories/hanoxy/data/info/haliea-gtdb.tsv")
genome_info = gtdb_info %>% separate(data = ., col = ncbi_taxo,
into = c('ncbi_domain', 'ncbi_phylum', 'ncbi_class', 'ncbi_order', 'ncbi_family', 'ncbi_genus', 'ncbi_species'),
sep = ";") %>%
separate(data = ., col = gtdb_taxo, into = c('domain', 'gtdb_phylum', 'gtdb_class', 'gtdb_order', 'gtdb_family', 'gtdb_genus', 'gtdb_species'), sep = ";") %>%
mutate(gtdb_species = substr(gtdb_species,5,nchar(gtdb_species)),
gtdb_genus = substr(gtdb_genus,5,nchar(gtdb_genus))) %>%
dplyr::select(id, ncbi_organism_name, gtdb_genus, gtdb_species)
Count data computed by CoverM,
which uses the mapping algorythm minimap2. The count data
is the number of reads aligned toq each genome. Note that a single read
may be aligned to multiple genomes with supplementary alignments.
Locate the files
count_dir <- "/Users/jegoussc/Repositories/hanoxy/results/counts"
count_files <- list.files(path = count_dir, pattern = 'tsv',
full.names = TRUE)
Read count data files
counts <- NA
for (f in count_files) {
# print(f)
r <- read.delim(file = f, header = TRUE,
row.names = 1)
counts = cbind(counts, r)
}
Rename columns
colnames(counts) <- substr(colnames(counts), 1, 8)
rownames(counts) <- substr(rownames(counts), 1, 15)
Inspect the data frame
Cleanup the data frame
counts = counts %>%
select(-counts) # remove the empty counts column
Melt dataset
mcounts = counts %>%
t() %>% # transpose
melt() # melt
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
# rename the columns
colnames(mcounts) = c("station_name", "genome", "count")
# inspect the resulting dataframe
mcounts %>%
head()
## station_name genome count
## 1 MIME_001 GCA_000423125.1 1567
## 2 MIME_002 GCA_000423125.1 592
## 3 TARA_030 GCA_000423125.1 15917
## 4 TARA_031 GCA_000423125.1 10027
## 5 TARA_032 GCA_000423125.1 7349
## 6 TARA_033 GCA_000423125.1 5758
Explore distribution
hist(mcounts$count, breaks = 100,
main = "Histogram of counts", xlab = "Counts")
mcts_genus = counts %>%
mutate(id = rownames(counts)) %>%
left_join(., genome_info, by = 'id') %>%
group_by(gtdb_genus) %>%
summarise(across(starts_with(c("TARA", "MIME")), mean)) %>%
select(gtdb_genus, starts_with(c("TARA", "MIME"))) %>%
as.data.frame() %>%
filter(!is.na(gtdb_genus)) %>%
melt()
## Using gtdb_genus as id variables
colnames(mcts_genus) = c("gtdb_genus", "station_name", "count")
mcts_genus$count = round(mcts_genus$count, digits = 0)
mcts_genus = left_join(mcts_genus, meta, by = "station_name")
Scale count data to library size to know the proportions of metagenomic reads recruited by the Halieaceae genomes in percentages.
perc = counts
for (s in colnames(counts)) {
perc[,s] = counts[,s]/meta[meta$station_name == s,]$filtered_reads * 100
}
Per genus
mperc_genus = perc %>%
round(., digits = 3) %>%
mutate(id = rownames(.)) %>%
left_join(., genome_info, by = 'id') %>%
group_by(gtdb_genus) %>%
summarise(across(where(is.numeric), sum)) %>%
as.data.frame() %>%
melt()
## Using gtdb_genus as id variables
colnames(mperc_genus) = c("genus", "station_name", "abundance")
Mean abundance per genus
mperc_genus %>% select(genus, abundance) %>% group_by(genus) %>%
summarise(across(where(is.numeric), c(mean = mean, sd = sd))) %>%
arrange(abundance_mean)
## # A tibble: 13 × 3
## genus abundance_mean abundance_sd
## <chr> <dbl> <dbl>
## 1 Halioglobus_A 0.0024 0.00191
## 2 Chromatocurvus 0.00552 0.00142
## 3 NZNC01 0.00688 0.0147
## 4 Pseudohaliea 0.00814 0.00313
## 5 GCA-2748505 0.00925 0.0638
## 6 Aequoribacter 0.00962 0.00449
## 7 CABYJX01 0.0101 0.00388
## 8 Kineobactrum 0.0135 0.0244
## 9 Haliea 0.0196 0.0106
## 10 Congregibacter 0.0217 0.00819
## 11 Parahaliea 0.0296 0.0129
## 12 Halioglobus 0.139 0.0883
## 13 Luminiphilus 0.422 0.345
Per species
mperc_species = perc %>%
round(., digits = 3) %>%
mutate(id = rownames(.)) %>%
left_join(., genome_info, by = 'id') %>%
group_by(gtdb_species) %>%
summarise(across(where(is.numeric), sum)) %>%
as.data.frame() %>%
melt()
## Using gtdb_species as id variables
colnames(mperc_species) = c("species", "station_name", "abundance")
mperc_species %>% select(species, abundance) %>%
group_by(species) %>%
summarise(across(where(is.numeric), mean)) %>%
arrange(abundance)
## # A tibble: 69 × 2
## species abundance
## <chr> <dbl>
## 1 Congregibacter sp002428935 0.00131
## 2 Halioglobus sp009937575 0.00157
## 3 Halioglobus sp013214285 0.00169
## 4 Luminiphilus sp002390485 0.00171
## 5 Luminiphilus sp008081075 0.0018
## 6 Luminiphilus sp002389505 0.00205
## 7 Haliea sp002414665 0.0022
## 8 Halioglobus_A pacificus 0.0024
## 9 Luminiphilus sp013911345 0.00243
## 10 Luminiphilus sp009886815 0.00263
## # … with 59 more rows
Merge the percentages of metagenomic reads recruited by the Halieaceae genomes.
mperc_genus = merge(mperc_genus, meta)
mperc_species = merge(mperc_species, meta)
Percentage per biome
mperc_species %>%
group_by(biome) %>%
summarise_at(., vars(abundance), c(mean, min, max, sd)) %>%
select(fn1) %>% mutate(fn1 = fn1 *100) %>% round(., digits = 1)
## # A tibble: 4 × 1
## fn1
## <dbl>
## 1 1.4
## 2 0.7
## 3 1
## 4 1
ggplot(mperc_genus, aes(x=abundance, fill = genus, colour = genus)) +
geom_density(alpha = .5) +
facet_grid(genus~., margins = TRUE, scales = "free") +
theme_minimal() +
scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Spectral"))(14)) +
scale_colour_manual(values = colorRampPalette(brewer.pal(8, "Spectral"))(14))
ggsave(
"fig-s2.pdf",
plot = last_plot(),
device = "pdf",
scale = 1,
width = 8,
height = 9,
dpi = 300
)
dat = mperc_species %>%
select(station_name, abundance) %>%
group_by(station_name) %>%
summarise(across(where(is.numeric), sum)) %>%
as.data.frame() %>%
left_join(., meta)
## Joining, by = "station_name"
map +
geom_point(aes(
x = dat$longitude,
y = dat$latitude,
size = dat$abundance,
colour = dat$biome),
alpha = .75) +
scale_colour_manual(values = c(brewer.pal(4, "RdBu")[1], brewer.pal(4, "RdBu")[4], brewer.pal(4, "RdBu")[2], brewer.pal(4, "RdBu")[3])) +
labs(#title = "Global distribution of the Halieaceae family",
colour = "Biome", size = "Abundance (%)",
#caption = "Percentages of metagenomics reads mapped to 69 de-replicated Halieaceae genomes\nat sampling stations representing the four marine biomes."
)
ggsave(
"fig-2.pdf",
plot = last_plot(),
device = "pdf",
scale = 1,
width = 8,
height = 4,
dpi = 300
)
This is an alternative to the scaling. Indeed, DESeq needs un-normalised count data as input.
http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#the-deseqdataset
Merge count data with genome info
cts_genus = counts %>%
mutate(id = rownames(counts)) %>%
left_join(., genome_info, by = 'id') %>%
group_by(gtdb_genus) %>%
summarise(across(starts_with(c("TARA", "MIME")), mean)) %>%
select(gtdb_genus, starts_with(c("TARA", "MIME"))) %>%
as.data.frame() %>%
filter(!is.na(gtdb_genus))
rownames(cts_genus) = cts_genus$gtdb_genus
cts_genus = cts_genus %>% select(starts_with(c("TARA", "MIME")))
cts_genus = round(cts_genus, digits = 0)
coldata = meta %>% filter(station_name %in% colnames(cts_genus))
rownames(coldata) = coldata$station_name
coldata = coldata %>%
mutate(condition = ifelse(biome == "Polar", "Polar", "Non Polar"))
all(rownames(coldata) == colnames(cts_genus))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData = cts_genus,
colData = coldata,
design = ~ biome) # biome or condition?
dds
## class: DESeqDataSet
## dim: 13 65
## metadata(1): version
## assays(1): counts
## rownames(13): Aequoribacter CABYJX01 ... Parahaliea Pseudohaliea
## rowData names(0):
## colnames(65): TARA_030 TARA_031 ... MIME_001 MIME_002
## colData names(20): station_name sampling_date ... season condition
featureData <- data.frame(genome=rownames(cts_genus))
mcols(dds) <- DataFrame(mcols(dds), featureData)
mcols(dds)
## DataFrame with 13 rows and 1 column
## genome
## <character>
## Aequoribacter Aequoribacter
## CABYJX01 CABYJX01
## Chromatocurvus Chromatocurvus
## Congregibacter Congregibacter
## GCA-2748505 GCA-2748505
## ... ...
## Kineobactrum Kineobactrum
## Luminiphilus Luminiphilus
## NZNC01 NZNC01
## Parahaliea Parahaliea
## Pseudohaliea Pseudohaliea
Apply methodology by Anders et al. (2010) https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106#Sec2
# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans = apply(counts(dds), 1, gm_mean)
dds = estimateSizeFactors(dds, geoMeans = geoMeans)
Differential abundances analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution with Wald test
dds <- DESeq(dds)
wtres <- results(dds)
Likelihood ratio test (chi-squared test) for GLMs to test for significance of change in deviance between a full (~biome) and reduced model.
dds <- estimateDispersionsGeneEst(dds)
## found already estimated gene-wise dispersions, removing these
dispersions(dds) <- mcols(dds)$dispGeneEst
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
dds <- nbinomLRT(dds, reduced = ~ 1)
## found results columns, replacing these
lrtres <- results(dds)
Select the species that differentially present between the conditions.
differential_genera = res %>% as.data.frame() %>%
filter(padj < 0.05) %>%
rownames()
differential_genera
## [1] "Aequoribacter" "CABYJX01" "Chromatocurvus" "Congregibacter"
## [5] "Halioglobus" "Luminiphilus" "Pseudohaliea"
Extract the normalised counts
ncts_genus = round(assay(normTransform(dds)), digits = 0)
Melt the normalised counts
mncts_genus = ncts_genus %>%
t() %>% # transpose
melt() # melt
# rename the columns
colnames(mncts_genus) = c("station_name", "genus", "count")
Possibility to inspect the resulting dataframe
Prepare the matrix to plot the heatmap; here we subselect the differential species.
ordered_station_names = meta %>%
filter(station_name %in% colnames(ncts_genus)) %>%
arrange(biome, ocean, station_name) %>%
select(station_name)
#matrix = ncts[rownames(ncts) %in% differential_species,] %>% unlist() %>%
# as.data.frame() %>%
# select(ordered_station_names$station_name)
matrix = ncts_genus %>% unlist() %>%
as.data.frame() %>%
select(ordered_station_names$station_name)
Set categorical variables and graphic parameters.
# significance levels
anno_sign = res %>% as.data.frame() %>%
select(padj) %>%
mutate(significance = gtools::stars.pval(padj)) %>%
select(significance) %>%
mutate(significance = replace(significance, significance %in% c(NA, ".", "", " "), "NS"))
anno <- as.data.frame(colData(dds)[,c("biome", "ocean", "season")])
# colour palette for the biomes
biome_colours <- brewer.pal(4, "RdBu")
names(biome_colours) <- unique(anno$biome)
# colour palette for the oceans
ocean_colours <- brewer.pal(6, "YlGnBu")[2:6]
names(ocean_colours) <- unique(anno$ocean)
season_colours = brewer.pal(4, "Pastel2")
names(season_colours) <- unique(anno$season)
significance_colours = c(brewer.pal(4, "Reds"))
names(significance_colours) <- c("NS", "*", "**", "***")
annoCol <- list(ocean = ocean_colours,
biome = biome_colours,
season = season_colours,
significance =significance_colours)
Check optimal number of clusters
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
df = scale(matrix)
# Elbow method
fviz_nbclust(df, kmeans, method = "wss") + #c("silhouette", "wss", "gap_stat"),
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
Draw the heatmap
pheatmap(matrix,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100),
cluster_rows = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
cluster_cols = FALSE,
fontsize_row = 7,
fontsize_col = 5,
annotation_col = anno,
annotation_row = anno_sign,
border_color = "white",
clustering_method = "complete", # "ward.D"
annotation_colors = annoCol,
cutree_rows = 4,
treeheight_row = 20,
gaps_col = c(12, 29, 54),
#main = "Global distribution of Halieaceae genera."
) %>% ggplotify::as.ggplot()
ggsave(
"fig-s3.pdf",
plot = last_plot(),
device = "pdf",
scale = 1,
width = 8,
height = 5,
dpi = 300
)
This is an alternative to the scaling. Indeed, DESeq needs un-normalised count data as input.
http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#the-deseqdataset
Merge count data with genome info
cts_species = counts %>%
mutate(id = rownames(counts)) %>%
left_join(., genome_info, by = 'id') %>%
group_by(gtdb_species) %>%
summarise(across(starts_with(c("TARA","MIME")), mean)) %>%
select(gtdb_species, starts_with(c("TARA","MIME"))) %>%
as.data.frame() %>%
filter(!is.na(gtdb_species))
rownames(cts_species) = cts_species$gtdb_species
cts_species = cts_species %>% select(starts_with(c("TARA","MIME")))
cts_species = round(cts_species, digits = 0)
coldata = meta %>% filter(station_name %in% colnames(cts_species))
rownames(coldata) = coldata$station_name
coldata = coldata %>%
mutate(condition = ifelse(biome == "Polar", "Polar", "Non Polar"))
all(rownames(coldata) == colnames(cts_species))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData = cts_species,
colData = coldata,
design = ~ biome) # biome or condition?
dds
## class: DESeqDataSet
## dim: 69 65
## metadata(1): version
## assays(1): counts
## rownames(69): Aequoribacter fuscus Aequoribacter sp003520285 ...
## Parahaliea sp014077625 Pseudohaliea rubra
## rowData names(0):
## colnames(65): TARA_030 TARA_031 ... MIME_001 MIME_002
## colData names(20): station_name sampling_date ... season condition
featureData <- data.frame(genome=rownames(cts_species))
mcols(dds) <- DataFrame(mcols(dds), featureData)
mcols(dds)
## DataFrame with 69 rows and 1 column
## genome
## <character>
## Aequoribacter fuscus Aequoribacter fuscus
## Aequoribacter sp003520285 Aequoribacter sp0035..
## CABYJX01 sp902519405 CABYJX01 sp902519405
## Chromatocurvus halotolerans Chromatocurvus halot..
## Congregibacter litoralis Congregibacter litor..
## ... ...
## Parahaliea aestuarii Parahaliea aestuarii
## Parahaliea aestuarii_A Parahaliea aestuarii_A
## Parahaliea mediterranea Parahaliea mediterra..
## Parahaliea sp014077625 Parahaliea sp014077625
## Pseudohaliea rubra Pseudohaliea rubra
Extract the normalised counts
ncts_species = round(assay(normTransform(dds)), digits = 0)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
Melt the normalised counts
mncts_species = ncts_species %>%
t() %>% # transpose
melt() # melt
# rename the columns
colnames(mncts_species) = c("station_name", "species", "count")
Mean of species count
summary(t(ncts_species))[4,] %>% sort()
## GCA-2748505 sp002748505 Luminiphilus sp002390485
## "Mean :10.63 " "Mean :10.68 "
## Luminiphilus sp008081075 Congregibacter sp002428935
## "Mean :10.75 " "Mean :10.92 "
## Halioglobus sp009937575 Luminiphilus sp013911345
## "Mean :11.02 " "Mean :11.08 "
## Luminiphilus sp002389505 Haliea sp002414665
## "Mean :11.17 " "Mean :11.2 "
## Luminiphilus sp009886815 Halioglobus sp013214285
## "Mean :11.22 " "Mean :11.29 "
## Luminiphilus sp002691565 Halioglobus sp002862985
## "Mean :11.48 " "Mean :11.52 "
## Luminiphilus sp002469785 Luminiphilus sp002683335
## "Mean :11.6 " "Mean :11.62 "
## Luminiphilus sp002683395 Halioglobus_A pacificus
## "Mean :11.62 " "Mean :11.66 "
## Luminiphilus sp002456975 Luminiphilus sp002689915
## "Mean :11.72 " "Mean :11.75 "
## Luminiphilus sp003331335 Luminiphilus sp012270045
## "Mean :11.78 " "Mean :11.88 "
## Aequoribacter sp003520285 Haliea sp002416125
## "Mean :11.97 " "Mean :12.15 "
## Luminiphilus sp002377985 Halioglobus sp002699145
## "Mean :12.17 " "Mean :12.18 "
## Halioglobus sp003646405 Halioglobus sp002862465
## "Mean :12.22 " "Mean :12.28 "
## Luminiphilus sp003331685 Halioglobus sp008370495
## "Mean :12.28 " "Mean :12.35 "
## Parahaliea aestuarii_A Halioglobus sp004354085
## "Mean :12.38 " "Mean :12.42 "
## Luminiphilus sp002703585 Luminiphilus sp009937065
## "Mean :12.45 " "Mean :12.55 "
## Parahaliea aestuarii Kineobactrum sediminis
## "Mean :12.66 " "Mean :12.82 "
## Luminiphilus syltensis Luminiphilus sp002705465
## "Mean :12.88 " "Mean :12.91 "
## Halioglobus sediminis NZNC01 sp002692965
## "Mean :12.92 " "Mean :12.92 "
## Chromatocurvus halotolerans Luminiphilus sp002721785
## "Mean :12.98 " "Mean :13.02 "
## Kineobactrum sp010669285 Luminiphilus sp902613175
## "Mean :13.08 " "Mean :13.08 "
## Haliea alexandrii Halioglobus maricola
## "Mean :13.11 " "Mean :13.14 "
## Halioglobus sp004745945 Halioglobus sp003646415
## "Mean :13.15 " "Mean :13.17 "
## Luminiphilus sp011523115 Luminiphilus sp003523185
## "Mean :13.23 " "Mean :13.26 "
## Aequoribacter fuscus Halioglobus lutimaris
## "Mean :13.31 " "Mean :13.32 "
## Halioglobus sp000156295 Parahaliea mediterranea
## "Mean :13.38 " "Mean :13.4 "
## Haliea salexigens Luminiphilus sp902631395
## "Mean :13.43 " "Mean :13.43 "
## Luminiphilus sp902547755 Pseudohaliea rubra
## "Mean :13.46 " "Mean :13.63 "
## Halioglobus japonicus_A Luminiphilus sp002862405
## "Mean :13.66 " "Mean :13.69 "
## Congregibacter litoralis CABYJX01 sp902519405
## "Mean :13.85 " "Mean :13.91 "
## Congregibacter sp000158155 Luminiphilus sp000169115
## "Mean :13.94 " "Mean :13.97 "
## Luminiphilus sp002696455 Parahaliea sp014077625
## "Mean :14.02 " "Mean :14.15 "
## Luminiphilus sp000227505 Luminiphilus sp902518075
## "Mean :14.34 " "Mean :14.63 "
## Luminiphilus sp902520195 Luminiphilus sp902623175
## "Mean :15.08 " "Mean :15.22 "
## Halioglobus japonicus
## "Mean :15.91 "
Possibility to inspect the resulting data frame
mncts_species %>%
head()
left_join(mncts_species, meta) %>%
group_by(biome, species) %>%
summarise(across(where(is.numeric), c(mean = mean, sd = sd))) %>%
arrange(count_mean)
left_join(mncts_species, meta) %>%
arrange(count) %>%
ggplot(., aes(x = species, y = count)) + geom_boxplot() + facet_wrap(~biome, ncol =1 ) + theme(axis.text.x = element_text(angle = 90))
Apply methodology by Anders et al. (2010) https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106#Sec2
# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans = apply(counts(dds), 1, gm_mean)
dds = estimateSizeFactors(dds, geoMeans = geoMeans)
Differential abundances analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution with Wald test
dds <- DESeq(dds)
wtres <- results(dds)
Likelihood ratio test (chi-squared test) for GLMs to test for significance of change in deviance between a full (~biome) and reduced model.
dds <- estimateDispersionsGeneEst(dds)
## found already estimated gene-wise dispersions, removing these
dispersions(dds) <- mcols(dds)$dispGeneEst
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
dds <- nbinomLRT(dds, reduced = ~ 1)
## found results columns, replacing these
lrtres <- results(dds)
Select the species that differentially present between the conditions.
differential_species = res %>% as.data.frame() %>%
filter(padj < 0.05) %>%
rownames()
differential_species
## [1] "Aequoribacter fuscus" "Aequoribacter sp003520285"
## [3] "CABYJX01 sp902519405" "Congregibacter litoralis"
## [5] "Congregibacter sp000158155" "Congregibacter sp002428935"
## [7] "Haliea alexandrii" "Haliea salexigens"
## [9] "Halioglobus japonicus" "Halioglobus japonicus_A"
## [11] "Halioglobus lutimaris" "Halioglobus maricola"
## [13] "Halioglobus sp000156295" "Halioglobus sp002699145"
## [15] "Halioglobus sp002862465" "Halioglobus sp002862985"
## [17] "Halioglobus sp004354085" "Halioglobus sp009937575"
## [19] "Halioglobus sp013214285" "Halioglobus_A pacificus"
## [21] "Kineobactrum sp010669285" "Luminiphilus sp000227505"
## [23] "Luminiphilus sp002377985" "Luminiphilus sp002389505"
## [25] "Luminiphilus sp002390485" "Luminiphilus sp002683335"
## [27] "Luminiphilus sp002683395" "Luminiphilus sp002696455"
## [29] "Luminiphilus sp002703585" "Luminiphilus sp002705465"
## [31] "Luminiphilus sp002721785" "Luminiphilus sp003331685"
## [33] "Luminiphilus sp003523185" "Luminiphilus sp008081075"
## [35] "Luminiphilus sp009886815" "Luminiphilus sp011523115"
## [37] "Luminiphilus sp012270045" "Luminiphilus sp013911345"
## [39] "Luminiphilus sp902518075" "Luminiphilus sp902520195"
## [41] "Luminiphilus sp902547755" "Luminiphilus sp902613175"
## [43] "Luminiphilus sp902623175" "Luminiphilus sp902631395"
## [45] "Parahaliea sp014077625"
Prepare the matrix to plot the heatmap; here we subselect the differential species.
ordered_station_names = meta %>%
filter(station_name %in% colnames(ncts_species)) %>%
arrange(biome, ocean, station_name) %>%
select(station_name)
#matrix = ncts[rownames(ncts) %in% differential_species,] %>% unlist() %>%
# as.data.frame() %>%
# select(ordered_station_names$station_name)
matrix = ncts_species %>% unlist() %>%
as.data.frame() %>%
select(ordered_station_names$station_name)
Check number of clusters
library(factoextra)
library(NbClust)
df = scale(matrix)
# Elbow method
fviz_nbclust(df, kmeans, method = "wss") + #c("silhouette", "wss", "gap_stat"),
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
Set categorical variables and graphic parameters.
# significance levels
anno_sign = res %>% as.data.frame() %>%
select(padj) %>%
mutate(significance = gtools::stars.pval(padj)) %>%
select(significance) %>%
mutate(significance = replace(significance, significance %in% c(NA, ".", "", " "), "NS"))
anno <- as.data.frame(colData(dds)[,c("biome", "ocean", "season")])
# colour palette for the biomes
biome_colours <- brewer.pal(4, "RdBu")
names(biome_colours) <- unique(anno$biome)
# colour palette for the oceans
ocean_colours <- brewer.pal(6, "YlGnBu")[2:6]
names(ocean_colours) <- unique(anno$ocean)
season_colours = brewer.pal(4, "Pastel2")
names(season_colours) <- unique(anno$season)
significance_colours = c(brewer.pal(4, "Reds"))
names(significance_colours) <- c("NS", "*", "**", "***")
annoCol <- list(ocean = ocean_colours,
biome = biome_colours,
season = season_colours,
significance =significance_colours)
Draw the heatmap
hm = pheatmap(matrix,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100),
cluster_rows = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
cluster_cols = FALSE,
fontsize_row = 7,
annotation_col = anno,
annotation_row = anno_sign,
border_color = "white",
clustering_method = "ward.D", #ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
annotation_colors = annoCol,
cutree_rows = 4, #gaps_row = c(2,3,4,7,8,12,29,31,63,64,68),
treeheight_row = 20,
gaps_col = c(12, 29, 54),
#main = "Abundance heatmap"
)
pheatmap(matrix,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100),
cluster_rows = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
cluster_cols = FALSE,
fontsize_row = 7,
fontsize_col = 5,
annotation_col = anno,
annotation_row = anno_sign,
border_color = "white",
clustering_method = "ward.D",
annotation_colors = annoCol,
cutree_rows = 4,
treeheight_row = 20,
gaps_col = c(12, 29, 54),
#main = "Abundance heatmap"
) %>% ggplotify::as.ggplot()
ggsave(
"fig-3.pdf",
plot = last_plot(),
device = "pdf",
scale = 1,
width = 8,
height = 8,
dpi = 300
)
extract clusters
clust <- cbind(matrix,
cluster = cutree(hm$tree_row,
k = 4))
ubiquitous = clust %>%
filter(cluster == 1) %>% rownames()
rare = clust %>%
filter(cluster == 2) %>% rownames()
coldtolerant = clust %>%
filter(cluster == 3) %>% rownames()
coldintolerant = clust %>%
filter(cluster == 4) %>% rownames()
#order_cluster = c(ubiquitous, rare, coldtolerant, coldintolerant)
order_cluster <- rev(rownames(matrix[c(hm$tree_row[["order"]]),]))
A negative binomial (NB) generalised linear model (GLM) was fit for each species against each of the variables in the metadata separately, using Deseq2.
A Wald test was then performed on each fit to assess the significance of the independent variable, followed by correction of the p values using the Benjamini−Hochberg (BH) method.
What genera display differential abundances depending on the temperature Fit NB GLM to the relationship between the abundances (counts) and the temperature (abundance as a function of temperature).
The Wald test can also be used with continuous variables such environmental measurements (temperature, salinity, etc.). In this case, the reported log2 fold change is per unit of change of that variable (“log2 fold change per degree C differences.”)
Wald test for the GLM coefficients
This function tests for significance of coefficients in a Negative Binomial GLM, using previously calculated sizeFactors (or normalizationFactors) and dispersion estimates.
The fitting proceeds as follows: standard maximum likelihood estimates for GLM coefficients (synonymous with “beta”, “log2 fold change”, “effect size”) are calculated.
For calculating Wald test p-values, the coefficients are scaled by their standard errors and then compared to a standard Normal distribution. The results function without any arguments will automatically perform a contrast of the last level of the last variable in the design formula over the first level. The contrast argument of the results function can be used to generate other comparisons.
Missing measurement for MIME samples; exclude these samples.
cts_species = cts_species %>% select(starts_with("TARA"))
coldata = filter(coldata, grepl("TARA",station_name))
Fit NB GLM and test for the significance of the coefficients using the Wald test.
dds <- DESeqDataSetFromMatrix(countData = cts_species,
colData = coldata,
design = ~ temperature)
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
restemp = as.data.frame(results(dds)) %>%
mutate(variable = "temperature") %>%
add_rownames(., var = 'genus') %>%
select(genus, variable, log2FoldChange, padj) %>%
mutate(sign = gtools::stars.pval(padj)) %>%
mutate(genus = factor(genus, levels = order_cluster))
Fit NB GLM and test for the significance of the coefficients using the Wald test.
dds <- DESeqDataSetFromMatrix(countData = cts_species,
colData = coldata,
design = ~ salinity) # biome or condition?
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
ressalinity = as.data.frame(results(dds)) %>%
mutate(variable = "salinity") %>%
add_rownames(., var = 'genus') %>%
select(genus, variable, log2FoldChange, padj) %>%
mutate(sign = gtools::stars.pval(padj)) %>%
mutate(genus = factor(genus, levels = order_cluster))
Fit NB GLM and test for the significance of the coefficients using the Wald test.
dds <- DESeqDataSetFromMatrix(countData = cts_species,
colData = coldata,
design = ~ nitrate)
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
resnitrate = as.data.frame(results(dds)) %>%
mutate(variable = "nitrate") %>%
add_rownames(., var = 'genus') %>%
select(genus, variable, log2FoldChange, padj) %>%
mutate(sign = gtools::stars.pval(padj)) %>%
mutate(genus = factor(genus, levels = order_cluster))
Fit NB GLM and test for the significance of the coefficients using the Wald test.
dds <- DESeqDataSetFromMatrix(countData = cts_species,
colData = coldata,
design = ~ oxygen)
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
resoxygen = as.data.frame(results(dds)) %>%
mutate(variable = "oxygen") %>%
add_rownames(., var = 'genus') %>%
select(genus, variable, log2FoldChange, padj) %>%
mutate(sign = gtools::stars.pval(padj)) %>%
mutate(genus = factor(genus, levels = order_cluster))
Fit NB GLM and test for the significance of the coefficients using the Wald test.
dds <- DESeqDataSetFromMatrix(countData = cts_species,
colData = coldata,
design = ~ chlorophyll)
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
reschlorophyll = as.data.frame(results(dds)) %>%
mutate(variable = "chlorophyll") %>%
add_rownames(., var = 'genus') %>%
select(genus, variable, log2FoldChange, padj) %>%
mutate(sign = gtools::stars.pval(padj)) %>%
mutate(genus = factor(genus, levels = order_cluster))
Set colour scheme for each environmental variable (according to the CPK coloring of chemical elements)
coltemp = scale_fill_distiller(palette = "RdBu")
colsalinity = scale_fill_gradient2(
midpoint = 0, mid = "white",
low = brewer.pal(8,"Greys")[6],
high = brewer.pal(5,"Purples")[5])
colnitrate = scale_fill_gradient2(
midpoint = 0, mid = "white",
low = brewer.pal(8,"Greys")[6],
high = brewer.pal(5,"Blues")[5])
coloxygen = scale_fill_distiller(palette = "RdGy")
colchlorophyll = scale_fill_gradient2(
midpoint = 0, mid = "white",
low = brewer.pal(8,"Greys")[6],
high = brewer.pal(5,"Greens")[5])
Cleanup ggplot theme
cleanup = theme_void() + theme(legend.position = "bottom",
axis.text.x = element_text(size = 10),
legend.title = element_blank(),
legend.direction = "horizontal",
legend.key.size = unit(8, 'pt'),
legend.text = element_text(size = 5, angle = 45, vjust = 1),
plot.margin = margin(l = -.5, r = -.5, unit = "cm"))
Generate all plots
p1 = ggplot(restemp, aes(x = variable, y = genus, fill=log2FoldChange)) +
geom_tile(colour = "white", lwd = .5, linetype = 1) +
geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") +
coltemp + cleanup
p2 = ggplot(ressalinity, aes(x = variable, y = genus, fill=log2FoldChange)) +
geom_tile(colour = "white", lwd = .5, linetype = 1) +
geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") +
colsalinity + cleanup
p3 = ggplot(resnitrate, aes(x = variable, y = genus, fill=log2FoldChange)) +
geom_tile(colour = "white", lwd = .5, linetype = 1) +
geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") +
colnitrate + cleanup
p4 = ggplot(resoxygen, aes(x = variable, y = genus, fill=log2FoldChange)) +
geom_tile(colour = "white", lwd = .5, linetype = 1) +
geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") +
coloxygen + cleanup
p5 = ggplot(reschlorophyll, aes(x = variable, y = genus, fill=log2FoldChange)) +
geom_tile(colour = "white", lwd = .5, linetype = 1) +
geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") +
colchlorophyll +cleanup
cowplot::plot_grid(NULL,
p1 + theme(axis.text.y = element_text(size = 6, hjust = 1)), NULL,
p2, NULL, p3, NULL, p4, NULL, p5,
ncol = 10,
align = 'hv', axis = 'tb',
rel_widths = c(0.4,1,-0.4,1,-0.4,1,-0.4,1,-0.4,1)) %>%
ggplotify::as.ggplot() +
labs(#title = "Envrionmental variables correlated with Halieaceae genera abundances.",
# subtitle = "Envrionmental data were obtained from the TARA Ocean data set.\n",
#caption = "\nSignificant correlations between genera and chemo-physical properties.\n Correlations were assessed using a negative binomial GLM for each Halieaceae genera.\n Gradients represent the log2 fold change and significance level\nrepresent the pvalue from Wald-test adjusted with the Benjamini−Hochberg procedure."
)
ggsave(
"fig-4.pdf",
plot = last_plot(),
device = "pdf",
scale = 0.9,
width = 10,
height = 10,
dpi = 300
)
https://swilke-geoscience.net/post/spatial_interpolation/
data = mperc_genus #%>%
#filter(species == "Halioglobus sp009937575")
map +
geom_point(aes(
x = data$longitude,
y = data$latitude,
size = data$abundance,
colour = data$genus))
Chi square
temptab = data.frame(
S = c(10,14,4,10), # number of species with significant correlation with temp in each cluster
NS = c(18,13,0,0) # number of species with NS correlation with temp in each cluster
)
chisq.test(temptab, correct=TRUE)
## Warning in chisq.test(temptab, correct = TRUE): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: temptab
## X-squared = 15.775, df = 3, p-value = 0.001261
salinitytab = data.frame(
S = c(1,3,0,0),
NS = c(27,24,4,10)
)
chisq.test(salinitytab, correct=TRUE)
## Warning in chisq.test(salinitytab, correct = TRUE): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: salinitytab
## X-squared = 2.5117, df = 3, p-value = 0.4732
nitratetab = data.frame(
S = c(1,3,0,0),
NS = c(27,24,4,10)
)
S = c(); NS = c()
for (i in 1:4) {
cc = clust %>%
filter(cluster == i) %>% rownames()
S = append(S,
resnitrate %>% filter(genus %in% cc) %>%
filter(padj <= 0.001) %>%
summarise(n = n()) %>% pull())
NS = append(NS,
resnitrate %>% filter(genus %in% cc) %>%
filter(padj > 0.001) %>%
summarise(n = n()) %>% pull())
}
tabnitrate = data.frame(S, NS)